{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "# given nn_file to run not know a lot will study\n",
    "from keras.models import Model, load_model\n",
    "from keras.layers import Input, Dropout, Dense, Embedding, SpatialDropout1D, concatenate, BatchNormalization, Flatten\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.preprocessing import text, sequence\n",
    "from keras.callbacks import Callback\n",
    "from keras import backend as K\n",
    "from keras.models import Model\n",
    "from keras.losses import mean_squared_error as mse_loss\n",
    "\n",
    "from keras import optimizers\n",
    "from keras.optimizers import RMSprop, Adam\n",
    "from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.model_selection import KFold\n",
    "import datetime\n",
    "import gc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = pd.read_csv('../Large_output/train_merge.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def reduce_mem_usage(df):\n",
    "    start_mem_usg = df.memory_usage().sum() / 1024**2 \n",
    "    print(\"Memory usage of properties dataframe is :\",start_mem_usg,\" MB\")\n",
    "    NAlist = [] # Keeps track of columns that have missing values filled in. \n",
    "    for col in df.columns:\n",
    "        if df[col].dtype != object:  # Exclude strings            \n",
    "            # Print current column type\n",
    "            print(\"******************************\")\n",
    "            print(\"Column: \",col)\n",
    "            print(\"dtype before: \",df[col].dtype)            \n",
    "            # make variables for Int, max and min\n",
    "            IsInt = False\n",
    "            mx = df[col].max()\n",
    "            mn = df[col].min()\n",
    "            print(\"min for this col: \",mn)\n",
    "            print(\"max for this col: \",mx)\n",
    "            # Integer does not support NA, therefore, NA needs to be filled\n",
    "            if not np.isfinite(df[col]).all(): \n",
    "                NAlist.append(col)\n",
    "                df[col].fillna(mn-1,inplace=True)  \n",
    "                   \n",
    "            # test if column can be converted to an integer\n",
    "            asint = df[col].fillna(0).astype(np.int64)\n",
    "            result = (df[col] - asint)\n",
    "            result = result.sum()\n",
    "            if result > -0.01 and result < 0.01:\n",
    "                IsInt = True            \n",
    "            # Make Integer/unsigned Integer datatypes\n",
    "            if IsInt:\n",
    "                if mn >= 0:\n",
    "                    if mx < 255:\n",
    "                        df[col] = df[col].astype(np.uint8)\n",
    "                    elif mx < 65535:\n",
    "                        df[col] = df[col].astype(np.uint16)\n",
    "                    elif mx < 4294967295:\n",
    "                        df[col] = df[col].astype(np.uint32)\n",
    "                    else:\n",
    "                        df[col] = df[col].astype(np.uint64)\n",
    "                else:\n",
    "                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:\n",
    "                        df[col] = df[col].astype(np.int8)\n",
    "                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:\n",
    "                        df[col] = df[col].astype(np.int16)\n",
    "                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:\n",
    "                        df[col] = df[col].astype(np.int32)\n",
    "                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:\n",
    "                        df[col] = df[col].astype(np.int64)    \n",
    "            # Make float datatypes 32 bit\n",
    "            else:\n",
    "                df[col] = df[col].astype(np.float32)\n",
    "            \n",
    "            # Print new column type\n",
    "            print(\"dtype after: \",df[col].dtype)\n",
    "            print(\"******************************\")\n",
    "    # Print final result\n",
    "    print(\"___MEMORY USAGE AFTER COMPLETION:___\")\n",
    "    mem_usg = df.memory_usage().sum() / 1024**2 \n",
    "    print(\"Memory usage is: \",mem_usg,\" MB\")\n",
    "    print(\"This is \",100*mem_usg/start_mem_usg,\"% of the initial size\")\n",
    "    return df, NAlist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def features_engineering(df):\n",
    "    \n",
    "    # Sort by localtime\n",
    "    df.sort_values(\"local_time\")\n",
    "    df.reset_index(drop=True)\n",
    "    \n",
    "    # Add more features\n",
    "    df[\"local_time\"] = pd.to_datetime(df[\"local_time\"],format=\"%Y-%m-%d %H:%M:%S\")\n",
    "    df[\"hour\"] = df[\"local_time\"].dt.hour\n",
    "    df[\"weekend\"] = df[\"local_time\"].dt.weekday\n",
    "    df['square_feet'] =  np.log1p(df['square_feet'])\n",
    "    \n",
    "    \n",
    "    # Encode Categorical Data\n",
    "    le = LabelEncoder()\n",
    "    df[\"primary_use\"] = le.fit_transform(df[\"primary_use\"])\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage of properties dataframe is : 2271.9295196533203  MB\n",
      "******************************\n",
      "Column:  building_id\n",
      "dtype before:  int64\n",
      "min for this col:  0\n",
      "max for this col:  1448\n",
      "dtype after:  uint16\n",
      "******************************\n",
      "******************************\n",
      "Column:  meter\n",
      "dtype before:  int64\n",
      "min for this col:  0\n",
      "max for this col:  3\n",
      "dtype after:  uint8\n",
      "******************************\n",
      "******************************\n",
      "Column:  meter_reading\n",
      "dtype before:  float64\n",
      "min for this col:  0.0\n",
      "max for this col:  880374.0\n",
      "dtype after:  float32\n",
      "******************************\n",
      "******************************\n",
      "Column:  site_id\n",
      "dtype before:  int64\n",
      "min for this col:  0\n",
      "max for this col:  15\n",
      "dtype after:  uint8\n",
      "******************************\n",
      "******************************\n",
      "Column:  square_feet\n",
      "dtype before:  int64\n",
      "min for this col:  283\n",
      "max for this col:  875000\n",
      "dtype after:  uint32\n",
      "******************************\n",
      "******************************\n",
      "Column:  year_built\n",
      "dtype before:  float64\n",
      "min for this col:  1900.0\n",
      "max for this col:  2017.0\n",
      "dtype after:  uint16\n",
      "******************************\n",
      "******************************\n",
      "Column:  floor_count\n",
      "dtype before:  float64\n",
      "min for this col:  1.0\n",
      "max for this col:  26.0\n",
      "dtype after:  uint8\n",
      "******************************\n",
      "******************************\n",
      "Column:  air_temperature\n",
      "dtype before:  float64\n",
      "min for this col:  -28.9\n",
      "max for this col:  47.2\n",
      "dtype after:  float32\n",
      "******************************\n",
      "******************************\n",
      "Column:  cloud_coverage\n",
      "dtype before:  float64\n",
      "min for this col:  0.0\n",
      "max for this col:  9.0\n",
      "dtype after:  float32\n",
      "******************************\n",
      "******************************\n",
      "Column:  dew_temperature\n",
      "dtype before:  float64\n",
      "min for this col:  -35.0\n",
      "max for this col:  26.1\n",
      "dtype after:  float32\n",
      "******************************\n",
      "******************************\n",
      "Column:  precip_depth_1_hr\n",
      "dtype before:  float64\n",
      "min for this col:  -1.0\n",
      "max for this col:  343.0\n",
      "dtype after:  float32\n",
      "******************************\n",
      "******************************\n",
      "Column:  is_holiday\n",
      "dtype before:  int64\n",
      "min for this col:  0\n",
      "max for this col:  1\n",
      "dtype after:  uint8\n",
      "******************************\n",
      "___MEMORY USAGE AFTER COMPLETION:___\n",
      "Memory usage is:  1060.2338409423828  MB\n",
      "This is  46.66666953225585 % of the initial size\n"
     ]
    }
   ],
   "source": [
    "df_train, NA_list = reduce_mem_usage(df_train)\n",
    "train_engineer = features_engineering(df_train)\n",
    "train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading']\\\n",
    "=train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading'].mul(0.2931)\n",
    "target = np.log1p(train_engineer[\"meter_reading\"])\n",
    "features = train_engineer[['building_id', 'meter','site_id','primary_use', \n",
    "                          'square_feet','air_temperature','cloud_coverage',\n",
    "                          'dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']]\n",
    "del df_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "categoricals = [\"building_id\", \"site_id\", \"meter\", \"primary_use\",  \"weekend\",'is_holiday','hour']\n",
    "\n",
    "numericals = ['square_feet','air_temperature','cloud_coverage','dew_temperature','precip_depth_1_hr']\n",
    "\n",
    "feat_cols = categoricals + numericals"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def model(dense_dim_1=64, dense_dim_2=32, dense_dim_3=32, dense_dim_4=16, \n",
    "dropout1=0.2, dropout2=0.1, dropout3=0.1, dropout4=0.1, lr=0.001):\n",
    "\n",
    "    #Inputs\n",
    "    site_id = Input(shape=[1], name=\"site_id\")\n",
    "    building_id = Input(shape=[1], name=\"building_id\")\n",
    "    meter = Input(shape=[1], name=\"meter\")\n",
    "    primary_use = Input(shape=[1], name=\"primary_use\")\n",
    "    square_feet = Input(shape=[1], name=\"square_feet\")\n",
    "    air_temperature = Input(shape=[1], name=\"air_temperature\")\n",
    "    cloud_coverage = Input(shape=[1], name=\"cloud_coverage\")\n",
    "    dew_temperature = Input(shape=[1], name=\"dew_temperature\")\n",
    "    hour = Input(shape=[1], name=\"hour\")\n",
    "    precip = Input(shape=[1], name=\"precip_depth_1_hr\")\n",
    "    weekend = Input(shape=[1], name=\"weekend\")\n",
    "    is_holiday = Input(shape=[1], name=\"is_holiday\")\n",
    "   \n",
    "    #Embeddings layers\n",
    "    emb_site_id = Embedding(16, 2)(site_id)\n",
    "    emb_building_id = Embedding(1449, 6)(building_id)\n",
    "    emb_meter = Embedding(4, 2)(meter)\n",
    "    emb_primary_use = Embedding(16, 2)(primary_use)\n",
    "    emb_hour = Embedding(24, 3)(hour)\n",
    "    emb_weekend= Embedding(7, 2)(weekend)\n",
    "    emb_is_holiday= Embedding(2, 2)(is_holiday)\n",
    "\n",
    "    concat_emb = concatenate([\n",
    "           Flatten() (emb_site_id)\n",
    "         , Flatten() (emb_building_id)\n",
    "         , Flatten() (emb_meter)\n",
    "         , Flatten() (emb_primary_use)\n",
    "         , Flatten() (emb_hour)\n",
    "         , Flatten() (emb_weekend)\n",
    "         , Flatten() (emb_is_holiday)\n",
    "    ])\n",
    "    \n",
    "    categ = Dropout(dropout1)(Dense(dense_dim_1,activation='relu') (concat_emb))\n",
    "    categ = BatchNormalization()(categ)\n",
    "    categ = Dropout(dropout2)(Dense(dense_dim_2,activation='relu') (categ))\n",
    "    \n",
    "    #main layer\n",
    "    main_l = concatenate([\n",
    "          categ\n",
    "        , square_feet\n",
    "        , air_temperature\n",
    "        , cloud_coverage\n",
    "        , dew_temperature\n",
    "        , precip\n",
    "    ])\n",
    "    \n",
    "    main_l = Dropout(dropout3)(Dense(dense_dim_3,activation='relu') (main_l))\n",
    "    main_l = BatchNormalization()(main_l)\n",
    "    main_l = Dropout(dropout4)(Dense(dense_dim_4,activation='relu') (main_l))\n",
    "    \n",
    "    #output\n",
    "    output = Dense(1) (main_l) \n",
    "\n",
    "    model = Model([ site_id,\n",
    "                    building_id, \n",
    "                    meter, \n",
    "                    primary_use, \n",
    "                    square_feet, \n",
    "                    air_temperature,\n",
    "                    cloud_coverage,\n",
    "                    dew_temperature, \n",
    "                    hour,\n",
    "                    weekend, \n",
    "                    precip,\n",
    "                    is_holiday], output)\n",
    "\n",
    "    model.compile(optimizer = Adam(lr=lr),\n",
    "                  loss= mse_loss,\n",
    "                  metrics=[root_mean_squared_error])\n",
    "    return model\n",
    "\n",
    "def root_mean_squared_error(y_true, y_pred):\n",
    "    return K.sqrt(K.mean(K.square(y_pred - y_true), axis=0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_keras_data(df, num_cols, cat_cols):\n",
    "    cols = num_cols + cat_cols\n",
    "    X = {col: np.array(df[col]) for col in cols}\n",
    "    return X\n",
    "\n",
    "def train_model(keras_model, X_t, y_train, batch_size, epochs, X_v, y_valid, fold, patience=3):\n",
    "    early_stopping = EarlyStopping(patience=patience, verbose=1)\n",
    "    model_checkpoint = ModelCheckpoint(\"model_\" + str(fold) + \".hdf5\",\n",
    "                                       save_best_only=True, verbose=1, monitor='val_root_mean_squared_error', mode='min')\n",
    "\n",
    "    hist = keras_model.fit(X_t, y_train, batch_size=batch_size, epochs=epochs,\n",
    "                            validation_data=(X_v, y_valid), verbose=1,\n",
    "                            callbacks=[early_stopping, model_checkpoint])\n",
    "\n",
    "    keras_model = load_model(\"model_\" + str(fold) + \".hdf5\", custom_objects={'root_mean_squared_error': root_mean_squared_error})\n",
    "    \n",
    "    return keras_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fold: 0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow_core/python/framework/indexed_slices.py:424: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
      "  \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 13234529 samples, validate on 6617893 samples\n",
      "Epoch 1/10\n",
      "13234529/13234529 [==============================] - 109s 8us/step - loss: 1.4571 - root_mean_squared_error: 1.1712 - val_loss: 0.9389 - val_root_mean_squared_error: 0.9452\n",
      "\n",
      "Epoch 00001: val_root_mean_squared_error improved from inf to 0.94522, saving model to model_0.hdf5\n",
      "Epoch 2/10\n",
      "13234529/13234529 [==============================] - 109s 8us/step - loss: 1.0605 - root_mean_squared_error: 1.0281 - val_loss: 0.9543 - val_root_mean_squared_error: 0.9528\n",
      "\n",
      "Epoch 00002: val_root_mean_squared_error did not improve from 0.94522\n",
      "Epoch 3/10\n",
      "13234529/13234529 [==============================] - 109s 8us/step - loss: 1.0351 - root_mean_squared_error: 1.0156 - val_loss: 0.9339 - val_root_mean_squared_error: 0.9429\n",
      "\n",
      "Epoch 00003: val_root_mean_squared_error improved from 0.94522 to 0.94295, saving model to model_0.hdf5\n",
      "Epoch 4/10\n",
      "13234529/13234529 [==============================] - 109s 8us/step - loss: 1.0247 - root_mean_squared_error: 1.0105 - val_loss: 0.9159 - val_root_mean_squared_error: 0.9323\n",
      "\n",
      "Epoch 00004: val_root_mean_squared_error improved from 0.94295 to 0.93228, saving model to model_0.hdf5\n",
      "Epoch 5/10\n",
      "13234529/13234529 [==============================] - 109s 8us/step - loss: 1.0180 - root_mean_squared_error: 1.0072 - val_loss: 0.9091 - val_root_mean_squared_error: 0.9285\n",
      "\n",
      "Epoch 00005: val_root_mean_squared_error improved from 0.93228 to 0.92852, saving model to model_0.hdf5\n",
      "Epoch 6/10\n",
      "13234529/13234529 [==============================] - 109s 8us/step - loss: 1.0129 - root_mean_squared_error: 1.0047 - val_loss: 0.9001 - val_root_mean_squared_error: 0.9235\n",
      "\n",
      "Epoch 00006: val_root_mean_squared_error improved from 0.92852 to 0.92353, saving model to model_0.hdf5\n",
      "Epoch 7/10\n",
      "13234529/13234529 [==============================] - 109s 8us/step - loss: 1.0099 - root_mean_squared_error: 1.0032 - val_loss: 0.9335 - val_root_mean_squared_error: 0.9423\n",
      "\n",
      "Epoch 00007: val_root_mean_squared_error did not improve from 0.92353\n",
      "Epoch 8/10\n",
      "13234529/13234529 [==============================] - 109s 8us/step - loss: 1.0074 - root_mean_squared_error: 1.0019 - val_loss: 0.9286 - val_root_mean_squared_error: 0.9399\n",
      "\n",
      "Epoch 00008: val_root_mean_squared_error did not improve from 0.92353\n",
      "Epoch 9/10\n",
      "13234529/13234529 [==============================] - 109s 8us/step - loss: 1.0050 - root_mean_squared_error: 1.0008 - val_loss: 0.9076 - val_root_mean_squared_error: 0.9276\n",
      "\n",
      "Epoch 00009: val_root_mean_squared_error did not improve from 0.92353\n",
      "Epoch 00009: early stopping\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow_core/python/framework/indexed_slices.py:424: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
      "  \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "**************************************************\n",
      "Fold: 1\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow_core/python/framework/indexed_slices.py:424: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
      "  \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 13234975 samples, validate on 6617447 samples\n",
      "Epoch 1/10\n",
      "13234975/13234975 [==============================] - 107s 8us/step - loss: 1.2988 - root_mean_squared_error: 1.1253 - val_loss: 0.9897 - val_root_mean_squared_error: 0.9762\n",
      "\n",
      "Epoch 00001: val_root_mean_squared_error improved from inf to 0.97622, saving model to model_1.hdf5\n",
      "Epoch 2/10\n",
      "13234975/13234975 [==============================] - 106s 8us/step - loss: 1.0499 - root_mean_squared_error: 1.0230 - val_loss: 0.9517 - val_root_mean_squared_error: 0.9552\n",
      "\n",
      "Epoch 00002: val_root_mean_squared_error improved from 0.97622 to 0.95523, saving model to model_1.hdf5\n",
      "Epoch 3/10\n",
      "13234975/13234975 [==============================] - 106s 8us/step - loss: 1.0268 - root_mean_squared_error: 1.0116 - val_loss: 0.9970 - val_root_mean_squared_error: 0.9809\n",
      "\n",
      "Epoch 00003: val_root_mean_squared_error did not improve from 0.95523\n",
      "Epoch 4/10\n",
      "13234975/13234975 [==============================] - 106s 8us/step - loss: 1.0169 - root_mean_squared_error: 1.0067 - val_loss: 0.9853 - val_root_mean_squared_error: 0.9744\n",
      "\n",
      "Epoch 00004: val_root_mean_squared_error did not improve from 0.95523\n",
      "Epoch 5/10\n",
      "13234975/13234975 [==============================] - 105s 8us/step - loss: 1.0102 - root_mean_squared_error: 1.0033 - val_loss: 1.0222 - val_root_mean_squared_error: 0.9944\n",
      "\n",
      "Epoch 00005: val_root_mean_squared_error did not improve from 0.95523\n",
      "Epoch 00005: early stopping\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow_core/python/framework/indexed_slices.py:424: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
      "  \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "**************************************************\n",
      "Fold: 2\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow_core/python/framework/indexed_slices.py:424: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
      "  \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 13235340 samples, validate on 6617082 samples\n",
      "Epoch 1/10\n",
      "13235340/13235340 [==============================] - 106s 8us/step - loss: 1.4815 - root_mean_squared_error: 1.1747 - val_loss: 0.9442 - val_root_mean_squared_error: 0.9473\n",
      "\n",
      "Epoch 00001: val_root_mean_squared_error improved from inf to 0.94727, saving model to model_2.hdf5\n",
      "Epoch 2/10\n",
      "13235340/13235340 [==============================] - 102s 8us/step - loss: 1.0550 - root_mean_squared_error: 1.0254 - val_loss: 0.9456 - val_root_mean_squared_error: 0.9477\n",
      "\n",
      "Epoch 00002: val_root_mean_squared_error did not improve from 0.94727\n",
      "Epoch 3/10\n",
      "13235340/13235340 [==============================] - 102s 8us/step - loss: 1.0277 - root_mean_squared_error: 1.0120 - val_loss: 0.9343 - val_root_mean_squared_error: 0.9429\n",
      "\n",
      "Epoch 00003: val_root_mean_squared_error improved from 0.94727 to 0.94288, saving model to model_2.hdf5\n",
      "Epoch 4/10\n",
      "13235340/13235340 [==============================] - 102s 8us/step - loss: 1.0165 - root_mean_squared_error: 1.0065 - val_loss: 0.9282 - val_root_mean_squared_error: 0.9399\n",
      "\n",
      "Epoch 00004: val_root_mean_squared_error improved from 0.94288 to 0.93989, saving model to model_2.hdf5\n",
      "Epoch 5/10\n",
      "13235340/13235340 [==============================] - 102s 8us/step - loss: 1.0100 - root_mean_squared_error: 1.0032 - val_loss: 0.9309 - val_root_mean_squared_error: 0.9411\n",
      "\n",
      "Epoch 00005: val_root_mean_squared_error did not improve from 0.93989\n",
      "Epoch 6/10\n",
      "13235340/13235340 [==============================] - 103s 8us/step - loss: 1.0055 - root_mean_squared_error: 1.0009 - val_loss: 0.9419 - val_root_mean_squared_error: 0.9471\n",
      "\n",
      "Epoch 00006: val_root_mean_squared_error did not improve from 0.93989\n",
      "Epoch 7/10\n",
      "13235340/13235340 [==============================] - 102s 8us/step - loss: 1.0020 - root_mean_squared_error: 0.9992 - val_loss: 0.9501 - val_root_mean_squared_error: 0.9507\n",
      "\n",
      "Epoch 00007: val_root_mean_squared_error did not improve from 0.93989\n",
      "Epoch 00007: early stopping\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow_core/python/framework/indexed_slices.py:424: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
      "  \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "**************************************************\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import KFold, StratifiedKFold\n",
    "\n",
    "oof = np.zeros(len(features))\n",
    "batch_size = 1024\n",
    "epochs = 10\n",
    "models = []\n",
    "\n",
    "folds = 3\n",
    "seed = 666\n",
    "\n",
    "kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)\n",
    "\n",
    "for fold_n, (train_index, valid_index) in enumerate(kf.split(features, features['building_id'])):\n",
    "    print('Fold:', fold_n)\n",
    "    X_train, X_valid = features.iloc[train_index], features.iloc[valid_index]\n",
    "    y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]\n",
    "    X_t = get_keras_data(X_train, numericals, categoricals)\n",
    "    X_v = get_keras_data(X_valid, numericals, categoricals)\n",
    "    \n",
    "    keras_model = model(dense_dim_1=64, dense_dim_2=32, dense_dim_3=32, dense_dim_4=16, \n",
    "                        dropout1=0.2, dropout2=0.1, dropout3=0.1, dropout4=0.1, lr=0.001)\n",
    "    mod = train_model(keras_model, X_t, y_train, batch_size, epochs, X_v, y_valid, fold_n, patience=3)\n",
    "    models.append(mod)\n",
    "    print('*'* 50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.93955"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(0.92353+0.95523+0.93989)/3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage of properties dataframe is : 4771.9117431640625  MB\n",
      "******************************\n",
      "Column:  building_id\n",
      "dtype before:  int64\n",
      "min for this col:  0\n",
      "max for this col:  1448\n",
      "dtype after:  uint16\n",
      "******************************\n",
      "******************************\n",
      "Column:  meter\n",
      "dtype before:  int64\n",
      "min for this col:  0\n",
      "max for this col:  3\n",
      "dtype after:  uint8\n",
      "******************************\n",
      "******************************\n",
      "Column:  site_id\n",
      "dtype before:  int64\n",
      "min for this col:  0\n",
      "max for this col:  15\n",
      "dtype after:  uint8\n",
      "******************************\n",
      "******************************\n",
      "Column:  square_feet\n",
      "dtype before:  int64\n",
      "min for this col:  283\n",
      "max for this col:  875000\n",
      "dtype after:  uint32\n",
      "******************************\n",
      "******************************\n",
      "Column:  year_built\n",
      "dtype before:  float64\n",
      "min for this col:  1900.0\n",
      "max for this col:  2017.0\n",
      "dtype after:  uint16\n",
      "******************************\n",
      "******************************\n",
      "Column:  floor_count\n",
      "dtype before:  float64\n",
      "min for this col:  1.0\n",
      "max for this col:  26.0\n",
      "dtype after:  uint8\n",
      "******************************\n",
      "******************************\n",
      "Column:  air_temperature\n",
      "dtype before:  float64\n",
      "min for this col:  -28.1\n",
      "max for this col:  48.3\n",
      "dtype after:  float32\n",
      "******************************\n",
      "******************************\n",
      "Column:  cloud_coverage\n",
      "dtype before:  float64\n",
      "min for this col:  0.0\n",
      "max for this col:  9.0\n",
      "dtype after:  float32\n",
      "******************************\n",
      "******************************\n",
      "Column:  dew_temperature\n",
      "dtype before:  float64\n",
      "min for this col:  -31.6\n",
      "max for this col:  26.7\n",
      "dtype after:  float32\n",
      "******************************\n",
      "******************************\n",
      "Column:  precip_depth_1_hr\n",
      "dtype before:  float64\n",
      "min for this col:  -1.0\n",
      "max for this col:  597.0\n",
      "dtype after:  float32\n",
      "******************************\n",
      "******************************\n",
      "Column:  is_holiday\n",
      "dtype before:  int64\n",
      "min for this col:  0\n",
      "max for this col:  1\n",
      "dtype after:  uint8\n",
      "******************************\n",
      "******************************\n",
      "Column:  row_id\n",
      "dtype before:  int64\n",
      "min for this col:  0\n",
      "max for this col:  41697599\n",
      "dtype after:  uint32\n",
      "******************************\n",
      "___MEMORY USAGE AFTER COMPLETION:___\n",
      "Memory usage is:  2226.8922119140625  MB\n",
      "This is  46.6666680309871 % of the initial size\n"
     ]
    }
   ],
   "source": [
    "test_feature = pd.read_csv('../Large_output/test_merge.csv')\n",
    "test_feature, NA_list = reduce_mem_usage(test_feature)\n",
    "test_feature = features_engineering(test_feature)\n",
    "test_feature = test_feature[['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\\\n",
    "                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday', 'row_id']]\n",
    "row_ids = test_feature[['row_id']]\n",
    "test_feature = test_feature[['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\\\n",
    "                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 834/834 [04:37<00:00,  3.01it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "i=0\n",
    "res = np.zeros((test_feature.shape[0]),dtype=np.float32)\n",
    "step_size = 50000\n",
    "for j in tqdm(range(int(np.ceil(test_feature.shape[0]/step_size)))):\n",
    "    for_prediction = get_keras_data(test_feature.iloc[i:i+step_size], numericals, categoricals)\n",
    "    res[i:min(i+step_size,test_feature.shape[0])] = \\\n",
    "       np.expm1(sum([model.predict(for_prediction, batch_size=1024)[:,0] for model in models])/folds)\n",
    "    i+=step_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_feature['meter_reading']=np.clip(res, 0, a_max=None)\n",
    "test_feature.loc[(test_feature['site_id']==0) & \n",
    "                 (test_feature['meter']==0),'meter_reading']=test_feature.loc[(test_feature['site_id']==0) &\n",
    "                                                            (test_feature['meter']==0),'meter_reading'].mul(3.4118)\n",
    "df_result = pd.DataFrame({'row_id': row_ids['row_id'], 'meter_reading': test_feature['meter_reading']})\n",
    "df_result.to_csv('../Large_output/nn_first.csv',index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(41697600, 2)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_result.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
